TL;DR

We add cell type information in X and look at W.

Data

As a first pass, I will only focus on the cells that passed the QC filters (by the original authors) and that were part of the “core” clusters. I will color-code the cells by either known cell type or by inferred cluster (inferred in the original study).

Select cells, remove ERCC spike-ins, filter out the genes that do not have at least 10 counts in at least 10 cells.

data("allen")
allen_core <- allen[grep("^ERCC-", rownames(allen), invert = TRUE),
                    which(colData(allen)$Core.Type=="Core")]

filter <- apply(assay(allen_core)>10, 1, sum)>=10

Number of retained genes:

print(sum(filter))
## [1] 11545

To speed up the computations, I will focus on the top 1,000 most variable genes.

allen_core <- allen_core[filter,]
core <- assay(allen_core)

vars <- rowVars(log1p(core))
names(vars) <- rownames(core)
vars <- sort(vars, decreasing = TRUE)
core <- core[names(vars)[1:1000],]

First, let’s look at PCA (of the log counts) for reference.

par(mfrow=c(1, 2))
bio <- as.factor(colData(allen_core)$driver_1_s)
cl <- as.factor(colData(allen_core)$Primary.Type)

detection_rate <- colSums(core>0)
coverage <- colSums(core)

pca <- prcomp(t(log1p(core)))
plot(pca$x, col=cols[bio], pch=19, main="PCA of log-counts, centered not scaled")
legend("bottomleft", levels(bio), fill=cols)
plot(pca$x, col=cols2[cl], pch=19, main="PCA of log-counts, centered not scaled")

df <- data.frame(PC1=pca$x[,1], PC2=pca$x[,2], detection_rate=detection_rate, coverage=coverage)
pairs(df, pch=19, col=cols[bio])

print(cor(df, method="spearman"))
##                        PC1         PC2 detection_rate   coverage
## PC1             1.00000000 -0.01052424      0.8066812  0.4453172
## PC2            -0.01052424  1.00000000     -0.3601716 -0.3286485
## detection_rate  0.80668124 -0.36017158      1.0000000  0.5275845
## coverage        0.44531717 -0.32864852      0.5275845  1.0000000

X with cell types, common disp

print(system.time(zinb_X <- zinbFit(core, ncores = 3, K = 2,
                                    X=model.matrix(~bio))))
##    user  system elapsed 
## 138.696  31.569  74.610

Plot the results with cells colored according to their biological condition.

par(mfrow=c(1, 2))
plot(zinb_X@W, col=cols[bio], pch=19, xlab="W1", ylab="W2", main="Both V intercepts")
legend("bottomright", levels(bio), fill=cols)

plot(zinb_X@W, col=cols2[cl], pch=19, xlab="W1", ylab="W2", main="Both V intercepts")
legend("bottom", levels(cl), fill=cols2, horiz=T)

Explore Gamma estimates

One interpretation of the model is that \(\gamma_mu\) will act as a “size factor”. Here, we check whether it captures sequencing depth and/or detection rate.

#total number of detected genes in the cell
df <- data.frame(W1=zinb_X@W[,1], W2=zinb_X@W[,2], gamma_mu = zinb_X@gamma_mu[1,], gamma_pi = zinb_X@gamma_pi[1,], detection_rate=detection_rate, coverage=coverage)
pairs(df, pch=19, col=cols[bio])

print(cor(df, method="spearman"))
##                         W1          W2    gamma_mu    gamma_pi
## W1              1.00000000  0.06243371 -0.14284292  0.03336945
## W2              0.06243371  1.00000000  0.04833728  0.22845102
## gamma_mu       -0.14284292  0.04833728  1.00000000 -0.25597360
## gamma_pi        0.03336945  0.22845102 -0.25597360  1.00000000
## detection_rate -0.04056153 -0.26312871  0.29942803 -0.99261966
## coverage        0.04966279 -0.07480962  0.87261478 -0.49206198
##                detection_rate    coverage
## W1                -0.04056153  0.04966279
## W2                -0.26312871 -0.07480962
## gamma_mu           0.29942803  0.87261478
## gamma_pi          -0.99261966 -0.49206198
## detection_rate     1.00000000  0.52758451
## coverage           0.52758451  1.00000000

\(\gamma_pi\) clearly captures the detection rate and \(\gamma_mu\) clearly captures the coverage. Interestingly, the two estimates are positively correlated.

par(mfrow=c(1, 2))
plot(rowMeans(getPi(zinb_X)), detection_rate, xlab="Average estimated Pi", ylab="detection rate", pch=19, col=cols[bio])

plot(rowMeans(log1p(getMu(zinb_X))), coverage, xlab="Average estimated log Mu", ylab="coverage", pch=19, col=cols[bio])

It is worth looking at the other estimates as well, namely, \(\beta\) and \(\alpha\).

par(mfrow=c(3, 2))
boxplot(zinb_X@beta_mu[1,], main="Beta_Mu")
abline(h=0)
boxplot(zinb_X@beta_pi[1,], main="Beta_Pi")
abline(h=0)
boxplot(zinb_X@beta_mu[2,], main="Beta_Mu")
abline(h=0)
boxplot(zinb_X@beta_pi[2,], main="Beta_Pi")
abline(h=0)
boxplot(zinb_X@beta_mu[3,], main="Beta_Mu")
abline(h=0)
boxplot(zinb_X@beta_pi[3,], main="Beta_Pi")
abline(h=0)

par(mfrow=c(2, 2))
plot(t(zinb_X@alpha_mu), xlab="Alpha_Mu 1", ylab="Alpha_Mu 2", main="Alpha_Mu")
plot(t(zinb_X@alpha_pi), xlab="Alpha_Pi 1", ylab="Alpha_Pi 2", main="Alpha_Pi")

X with cell types, diff disp

print(system.time(zinb_Xdisp <- zinbFit(core, ncores = 3, K = 2,
                                    X=model.matrix(~bio),commondispersion=F)))
##    user  system elapsed 
## 418.942 220.293 247.063

Plot the results with cells colored according to their biological condition.

par(mfrow=c(1, 2))
plot(zinb_Xdisp@W, col=cols[bio], pch=19, xlab="W1", ylab="W2", main="Both V intercepts")
legend("bottomright", levels(bio), fill=cols)

plot(zinb_Xdisp@W, col=cols2[cl], pch=19, xlab="W1", ylab="W2", main="Both V intercepts")
legend("bottom", levels(cl), fill=cols2,horiz = T)

Explore Gamma estimates

One interpretation of the model is that \(\gamma_mu\) will act as a “size factor”. Here, we check whether it captures sequencing depth and/or detection rate.

#total number of detected genes in the cell
df <- data.frame(W1=zinb_Xdisp@W[,1], W2=zinb_Xdisp@W[,2], gamma_mu = zinb_Xdisp@gamma_mu[1,], gamma_pi = zinb_Xdisp@gamma_pi[1,], detection_rate=detection_rate, coverage=coverage)
pairs(df, pch=19, col=cols[bio])

print(cor(df, method="spearman"))
##                         W1          W2    gamma_mu    gamma_pi
## W1              1.00000000  0.15563821 -0.12603586  0.02071153
## W2              0.15563821  1.00000000  0.09032643  0.25316448
## gamma_mu       -0.12603586  0.09032643  1.00000000 -0.22512973
## gamma_pi        0.02071153  0.25316448 -0.22512973  1.00000000
## detection_rate -0.02162210 -0.28934921  0.27311190 -0.98961610
## coverage        0.06296143 -0.08869507  0.86093195 -0.48543702
##                detection_rate    coverage
## W1                 -0.0216221  0.06296143
## W2                 -0.2893492 -0.08869507
## gamma_mu            0.2731119  0.86093195
## gamma_pi           -0.9896161 -0.48543702
## detection_rate      1.0000000  0.52758451
## coverage            0.5275845  1.00000000

\(\gamma_pi\) clearly captures the detection rate and \(\gamma_mu\) clearly captures the coverage. Interestingly, the two estimates are positively correlated.

par(mfrow=c(1, 2))
plot(rowMeans(getPi(zinb_Xdisp)), detection_rate, xlab="Average estimated Pi", ylab="detection rate", pch=19, col=cols[bio])

plot(rowMeans(log1p(getMu(zinb_Xdisp))), coverage, xlab="Average estimated log Mu", ylab="coverage", pch=19, col=cols[bio])

It is worth looking at the other estimates as well, namely, \(\beta\) and \(\alpha\).

par(mfrow=c(3, 2))
boxplot(zinb_Xdisp@beta_mu[1,], main="Beta_Mu")
abline(h=0)
boxplot(zinb_Xdisp@beta_pi[1,], main="Beta_Pi")
abline(h=0)
boxplot(zinb_Xdisp@beta_mu[2,], main="Beta_Mu")
abline(h=0)
boxplot(zinb_Xdisp@beta_pi[2,], main="Beta_Pi")
abline(h=0)
boxplot(zinb_Xdisp@beta_mu[3,], main="Beta_Mu")
abline(h=0)
boxplot(zinb_Xdisp@beta_pi[3,], main="Beta_Pi")
abline(h=0)

par(mfrow=c(1, 2))
plot(t(zinb_Xdisp@alpha_mu), xlab="Alpha_Mu 1", ylab="Alpha_Mu 2", main="Alpha_Mu")
plot(t(zinb_Xdisp@alpha_pi), xlab="Alpha_Pi 1", ylab="Alpha_Pi 2", main="Alpha_Pi")

X with cell types, no V intercept

print(system.time(zinb_XnoneV <- zinbFit(core, ncores = 3, K = 2,
                                    X=model.matrix(~bio),
                                    V=matrix(0, ncol=1, nrow=NROW(core)))))
##    user  system elapsed 
## 301.229  63.112 147.607

Plot the results with cells colored according to their biological condition.

par(mfrow=c(1, 2))
plot(zinb_XnoneV@W, col=cols[bio], pch=19, xlab="W1", ylab="W2", main="Both V intercepts")
legend("bottomright", levels(bio), fill=cols)

plot(zinb_XnoneV@W, col=cols2[cl], pch=19, xlab="W1", ylab="W2", main="Both V intercepts")
legend("bottom", levels(cl), fill=cols2, horiz=T)

Explore Gamma estimates

One interpretation of the model is that \(\gamma_mu\) will act as a “size factor”. Here, we check whether it captures sequencing depth and/or detection rate.

#total number of detected genes in the cell
df <- data.frame(W1=zinb_XnoneV@W[,1], W2=zinb_XnoneV@W[,2], detection_rate=detection_rate, coverage=coverage)
pairs(df, pch=19, col=cols[bio])

print(cor(df, method="spearman"))
##                         W1          W2 detection_rate   coverage
## W1              1.00000000 -0.08867641    -0.02791175 -0.1674994
## W2             -0.08867641  1.00000000     0.85434291  0.7717366
## detection_rate -0.02791175  0.85434291     1.00000000  0.5275845
## coverage       -0.16749936  0.77173664     0.52758451  1.0000000

\(\gamma_pi\) clearly captures the detection rate and \(\gamma_mu\) clearly captures the coverage. Interestingly, the two estimates are positively correlated.

par(mfrow=c(1, 2))
plot(rowMeans(getPi(zinb_XnoneV)), detection_rate, xlab="Average estimated Pi", ylab="detection rate", pch=19, col=cols[bio])

plot(rowMeans(log1p(getMu(zinb_XnoneV))), coverage, xlab="Average estimated log Mu", ylab="coverage", pch=19, col=cols[bio])

It is worth looking at the other estimates as well, namely, \(\beta\) and \(\alpha\).

par(mfrow=c(3, 2))
boxplot(zinb_XnoneV@beta_mu[1,], main="Beta_Mu")
abline(h=0)
boxplot(zinb_XnoneV@beta_pi[1,], main="Beta_Pi")
abline(h=0)
boxplot(zinb_XnoneV@beta_mu[2,], main="Beta_Mu")
abline(h=0)
boxplot(zinb_XnoneV@beta_pi[2,], main="Beta_Pi")
abline(h=0)
boxplot(zinb_XnoneV@beta_mu[3,], main="Beta_Mu")
abline(h=0)
boxplot(zinb_XnoneV@beta_pi[3,], main="Beta_Pi")
abline(h=0)

par(mfrow=c(2, 2))
plot(t(zinb_XnoneV@alpha_mu), xlab="Alpha_Mu 1", ylab="Alpha_Mu 2", main="Alpha_Mu")
plot(t(zinb_XnoneV@alpha_pi), xlab="Alpha_Pi 1", ylab="Alpha_Pi 2", main="Alpha_Pi")